Visualise some aspect of the data you find interesting, e.g., the average number of free throws per period for the regular season and the playoffs.
Load analysis packages.
library(tidyverse)
#> Loading tidyverse: ggplot2
#> Loading tidyverse: tibble
#> Loading tidyverse: tidyr
#> Loading tidyverse: readr
#> Loading tidyverse: purrr
#> Loading tidyverse: dplyr
#> Conflicts with tidy packages ----------------------------------------------
#> filter(): dplyr, stats
#> lag(): dplyr, stats
library(MangoTest)
library(ggridges)
library(plotly)
#>
#> Attaching package: 'plotly'
#> The following object is masked from 'package:ggplot2':
#>
#> last_plot
#> The following object is masked from 'package:stats':
#>
#> filter
#> The following object is masked from 'package:graphics':
#>
#> layout
Load in the data and explore structure. Data lists the play by play scores in multiple games accross several seasons.
str(free_throws)
#> Classes 'tbl_df', 'tbl' and 'data.frame': 618019 obs. of 11 variables:
#> $ end_result: chr "106 - 114" "106 - 114" "106 - 114" "106 - 114" ...
#> $ game : chr "PHX - LAL" "PHX - LAL" "PHX - LAL" "PHX - LAL" ...
#> $ game_id : num 2.61e+08 2.61e+08 2.61e+08 2.61e+08 2.61e+08 ...
#> $ period : num 1 1 1 1 1 1 1 2 2 2 ...
#> $ play : chr "Andrew Bynum makes free throw 1 of 2" "Andrew Bynum makes free throw 2 of 2" "Andrew Bynum makes free throw 1 of 2" "Andrew Bynum misses free throw 2 of 2" ...
#> $ player : chr "Andrew Bynum" "Andrew Bynum" "Andrew Bynum" "Andrew Bynum" ...
#> $ playoffs : chr "regular" "regular" "regular" "regular" ...
#> $ score : chr "0 - 1" "0 - 2" "18 - 12" "18 - 12" ...
#> $ season : chr "2006 - 2007" "2006 - 2007" "2006 - 2007" "2006 - 2007" ...
#> $ shot_made : int 1 1 1 0 1 1 1 0 1 1 ...
#> $ time :Classes 'hms', 'difftime' atomic [1:618019] 42300 42300 26760 26760 26280 ...
#> .. ..- attr(*, "units")= chr "secs"
#> - attr(*, "spec")=List of 2
#> ..$ cols :List of 11
#> .. ..$ end_result: list()
#> .. .. ..- attr(*, "class")= chr "collector_character" "collector"
#> .. ..$ game : list()
#> .. .. ..- attr(*, "class")= chr "collector_character" "collector"
#> .. ..$ game_id : list()
#> .. .. ..- attr(*, "class")= chr "collector_double" "collector"
#> .. ..$ period : list()
#> .. .. ..- attr(*, "class")= chr "collector_double" "collector"
#> .. ..$ play : list()
#> .. .. ..- attr(*, "class")= chr "collector_character" "collector"
#> .. ..$ player : list()
#> .. .. ..- attr(*, "class")= chr "collector_character" "collector"
#> .. ..$ playoffs : list()
#> .. .. ..- attr(*, "class")= chr "collector_character" "collector"
#> .. ..$ score : list()
#> .. .. ..- attr(*, "class")= chr "collector_character" "collector"
#> .. ..$ season : list()
#> .. .. ..- attr(*, "class")= chr "collector_character" "collector"
#> .. ..$ shot_made : list()
#> .. .. ..- attr(*, "class")= chr "collector_integer" "collector"
#> .. ..$ time :List of 1
#> .. .. ..$ format: chr ""
#> .. .. ..- attr(*, "class")= chr "collector_time" "collector"
#> ..$ default: list()
#> .. ..- attr(*, "class")= chr "collector_guess" "collector"
#> ..- attr(*, "class")= chr "col_spec"
Split out teams, and scores in each game.
tidy_free_throws <- free_throws %>%
separate(game, c("team_1", "team_2"), " - ") %>%
separate(score, c("score_1", "score_2"), " - ") %>%
separate(end_result, c("end_result_1", "end_result_2"), " - ")
Seperate game wide information, play information and scores information. Convert to factors as appropriate.
games_df <- tidy_free_throws %>%
select(game_id, team_1, team_2, playoffs,
season, end_result_1, end_result_2) %>%
unique %>%
mutate(draw = ifelse(end_result_1 == end_result_2, "Yes", "No")) %>%
gather(key = "order", value = "team", team_1, team_2) %>%
separate(order, c("tmp", "order"), "_") %>%
select(-tmp) %>%
gather(key = "order2", value = "end_result",
end_result_1, end_result_2) %>%
separate(order2, c("tmp", "order2"), "result_") %>%
select(-tmp) %>%
filter(order == order2) %>%
select(-order2) %>%
mutate_at(.vars = vars(order, team, season, playoffs),
.funs = funs(factor(.))) %>%
mutate(end_result = as.numeric(end_result)) %>%
mutate(season = factor(season, levels = rev(paste0(2006:2015, " - ", 2007:2016))))
plays_df <- tidy_free_throws %>%
select(game_id, period, play, player, shot_made, time) %>%
mutate(player = factor(player))
scores_df <- tidy_free_throws %>%
select(game_id, score_1, score_2, period, time) %>%
gather(key = "order", value = "score", score_1, score_2) %>%
separate(order, c("tmp", "order"), "_") %>%
select(-tmp) %>%
mutate(order = factor(order)) %>%
mutate(score = as.numeric(score))
Add game outcome variable.
games_df <- games_df %>%
group_by(game_id) %>%
arrange(desc(end_result), .by_group = TRUE) %>%
mutate(outcome = c("Won", "Lost")) %>%
ungroup() %>%
mutate(outcome = ifelse(draw %in% "Yes", "Draw", outcome)) %>%
mutate(outcome = factor(outcome)) %>%
select(-draw)
Summarise each dataset in turn. No draws in the dataset - online search suggests NBA games are played until a team wins.
summary(games_df)
#> game_id playoffs season order
#> Min. :261031013 playoffs: 1672 2013 - 2014:2634 1:12874
#> 1st Qu.:290119029 regular :24076 2008 - 2009:2632 2:12874
#> Median :310410028 2015 - 2016:2626
#> Mean :336085907 2009 - 2010:2626
#> 3rd Qu.:400489596 2010 - 2011:2624
#> Max. :400878160 2012 - 2013:2622
#> (Other) :9984
#> team end_result outcome
#> SA : 934 Min. : 54.00 Lost:12874
#> MIA : 918 1st Qu.: 91.00 Won :12874
#> BOS : 910 Median : 99.00
#> CLE : 902 Mean : 99.55
#> LAL : 901 3rd Qu.:107.00
#> ATL : 886 Max. :168.00
#> (Other):20297
summary(plays_df)
#> game_id period play
#> Min. :261031013 Min. :1.000 Length:618019
#> 1st Qu.:281226023 1st Qu.:2.000 Class :character
#> Median :310306001 Median :3.000 Mode :character
#> Mean :333936881 Mean :2.696
#> 3rd Qu.:400489501 3rd Qu.:4.000
#> Max. :400878160 Max. :8.000
#>
#> player shot_made time
#> LeBron James : 8001 Min. :0.0000 Length:618019
#> Dwight Howard : 7728 1st Qu.:1.0000 Class1:hms
#> Kevin Durant : 6030 Median :1.0000 Class2:difftime
#> Dwyane Wade : 5594 Mean :0.7568 Mode :numeric
#> Kobe Bryant : 5594 3rd Qu.:1.0000
#> Carmelo Anthony: 5318 Max. :1.0000
#> (Other) :579754
summary(scores_df)
#> game_id period time order
#> Min. :261031013 Min. :1.000 Length:1236038 1:618019
#> 1st Qu.:281226023 1st Qu.:2.000 Class1:hms 2:618019
#> Median :310306001 Median :3.000 Class2:difftime
#> Mean :333936881 Mean :2.696 Mode :numeric
#> 3rd Qu.:400489501 3rd Qu.:4.000
#> Max. :400878160 Max. :8.000
#> score
#> Min. : 0.00
#> 1st Qu.: 32.00
#> Median : 57.00
#> Mean : 56.47
#> 3rd Qu.: 81.00
#> Max. :166.00
games_df %>%
ggplot(aes(x = end_result, y = season, fill = outcome)) +
geom_density_ridges(alpha = 0.6) +
theme_minimal() +
scale_fill_viridis_d() +
guides(fill = guide_legend(title = "Game Outcome")) +
labs(x = "Final Score",
y = "Season",
title = "Distribution of NBA Final Scores",
subtitle = "By Game Outcome from 2006 until 2016",
caption = "By Sam Abbott, for Mango Solutions. Source: NBA Free Throws")
#> Picking joint bandwidth of 2.26
scores_df %>%
left_join(games_df %>%
select(game_id, order, team, outcome, season),
by = c("game_id", "order")) %>%
mutate(period = factor(period)) %>%
group_by(team, period, game_id) %>%
arrange(desc(score), .by_group = TRUE) %>%
slice(1) %>%
ungroup %>%
ggplot(aes(x = score, y = period,
fill = outcome)) +
geom_density_ridges(alpha = 0.6) +
scale_fill_viridis_d() +
theme_minimal() +
guides(fill = guide_legend(title = "Game Outcome")) +
labs(x = "Score",
y = "Period",
title = "Distribution of NBA Scores during Play",
subtitle = "By Game Outcome for all Teams: 2006 - 2016",
caption = "By Sam Abbott, for Mango Solutions. Source: NBA Free Throws")
#> Picking joint bandwidth of 2.27
conv_player_df <- plays_df %>%
left_join(games_df %>%
select(game_id, season) %>%
unique,
by = "game_id") %>%
group_by(player, season) %>%
summarise(conversion = mean(shot_made), shots = n()) %>%
mutate(conversion = round(conversion, digits = 3)) %>%
ungroup
conv_player_df %>%
plot_ly(y = ~conversion*100, x = ~shots,
text = ~player,
frame = ~season) %>%
add_markers() %>%
layout(yaxis = list(title = "Conversion Rate", range = c(0, 110))) %>%
layout(yaxis = list(ticksuffix = "%")) %>%
layout(xaxis = list(title = "Free Throws")) %>%
layout(title = "NBA Free Throw Conversion Rate") %>%
animation_opts(2000, redraw = FALSE, easing = "elastic") %>%
animation_slider(currentvalue = list(prefix = "Season: ")) %>%
hide_legend()